Set up

source("../functions.R")
tol12qualitative=c("#332288", "#6699CC", "#88CCEE", "#44AA99", "#117733", "#999933", "#DDCC77", "#661100", "#CC6677", "#AA4466", "#882255", "#AA4499")
if (!file.exists("output")) {
  system("mkdir output")
}
parallel = TRUE
ncores = 10

Load packages

if (!require(DCARS)) {
  library(devtools)
  devtools::install_github("shazanfar/DCARS")
}
library(DCARS)
library(ggplot2)
library(gplots)
library(SingleCellExperiment)
library(scater)
library(scran)
library(reshape)
library(scattermore)
library(dynamicTreeCut)
library(ComplexHeatmap)
library(GO.db)
library(org.Mm.eg.db)
library(stringr)
library(matrixStats)
library(ggforce)
library(patchwork)
library(ggpubr)
library(cowplot)
library(parallel)
library(GGally)
library(corrplot)
library(UpSetR)

Process MOB data from URL

if (!file.exists("output/counts_coords.RData")) {
  
  # data file URL https://www.spatialresearch.org/wp-content/uploads/2016/07/Rep11_MOB_count_matrix-1.tsv
  # might take a minute to download
  counts_raw = read.delim(url("https://www.spatialresearch.org/wp-content/uploads/2016/07/Rep11_MOB_count_matrix-1.tsv"), header = TRUE, row.names = 1)
  # counts_raw = read.delim(file = "Rep11_MOB_count_matrix-1.tsv", header = TRUE, row.names = 1)
  dim(counts_raw)
  
  coords_raw = do.call(rbind, strsplit(rownames(counts_raw), "x"))
  # xy plane only
  
  coords = apply(coords_raw, 1:2, as.numeric)
  colnames(coords) <- c("x","y")
  rownames(coords) <- rownames(counts_raw)
  
  plot(coords[,"x"], coords[,"y"])
  
  counts = t(counts_raw)
  
  sce = SingleCellExperiment(assays = list(counts = counts),
                             colData = coords)
  sce <- scater::normalize(sce)
  
  var.fit <- trendVar(sce, parametric=TRUE, loess.args=list(span=0.3), use.spikes = FALSE)
  var.out <- decomposeVar(sce, var.fit)
  
  pdf(file = "output/HVG_selection.pdf", height = 8, width = 8)
  plot(var.out$mean, var.out$total, pch=16, cex=0.6, xlab="Mean log-expression", 
       ylab="Variance of log-expression")
  curve(var.fit$trend(x), col="dodgerblue", lwd=2, add=TRUE)
  
  seqvals = seq(min(var.out$mean), max(var.out$mean), length.out = 1000)
  peakExp = seqvals[which.max(var.fit$trend(seqvals))]
  
  hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$mean > peakExp),]
  nrow(hvg.out)
  hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),] 
  points(var.out$mean[which(var.out$FDR <= 0.05 & var.out$mean > peakExp)], 
         var.out$total[which(var.out$FDR <= 0.05 & var.out$mean > peakExp)], col="red", pch=16)
  abline(v = peakExp, lty = 2, col = "black")
  dev.off()
  
  head(hvg.out)
  
  HVG = sort(rownames(hvg.out))
  
  expr = logcounts(sce)
  
  save(counts, coords, sce, HVG, expr, file = "output/counts_coords.RData")
} else {
  load("output/counts_coords.RData")
}

W = weightMatrix_nD(coords, span = 0.05)

Test for spatial differential expression and remove

This takes a long time (approx 2 hours) to run.

#### test for spatial differential expression by using weightedMean
# results in a list of non-spatially DE genes
samepairs = cbind(rownames(expr), rownames(expr))
rownames(samepairs) = samepairs[,1]

if (!file.exists("output/DE_stats_all.Rds")) {
  DE_stats_all = DCARSacrossNetwork(expr,
                                    samepairs,
                                    W = W,
                                    weightedConcordanceFunction = weightedMeanMatrixStats,
                                    extractTestStatisticOnly = TRUE,
                                    niter = 1000,
                                    verbose = FALSE)
  saveRDS(DE_stats_all, file = "output/DE_stats_all.Rds")
} else {
  DE_stats_all = readRDS("output/DE_stats_all.Rds")
}

if (!file.exists("output/DE_sampled_permstats.Rdata")) {
  # Does globalCor relate to the null distribution for genes?
  set.seed(500)
  globalMean = rowMeans(expr)
  pairs_sampled = samepairs[DCARS::stratifiedSample(globalMean, length = 100),]
  
  sampled_permstats = DCARSacrossNetwork(expr,
                                         pairs_sampled,
                                         W = W,
                                         weightedConcordanceFunction = weightedMeanMatrixStats,
                                         extractPermutationTestStatistics = TRUE,
                                         niter = 1000,
                                         verbose = TRUE)
  save(sampled_permstats, pairs_sampled, globalMean, file = "output/DE_sampled_permstats.Rdata")
} else {
  load("output/DE_sampled_permstats.Rdata")
}
plot(globalMean[rownames(pairs_sampled)], unlist(lapply(unlist(sampled_permstats, recursive = FALSE),quantile, 0.99)))
points(globalMean[rownames(pairs_sampled)], unlist(lapply(unlist(sampled_permstats, recursive = FALSE),quantile, 0.95)), col = "red")
points(globalMean[rownames(pairs_sampled)], unlist(lapply(unlist(sampled_permstats, recursive = FALSE),quantile, 0.90)), col = "blue")

# looks like there is a general relationship between global mean and the null distribution, but this does not appear to be a strong enough association to predict null distribution by just globalMean
# will need to run full permutation testing

if (!file.exists("output/DE_n100_permstats.Rds")) {
  set.seed(500)
  if (!parallel) {
    n100_permstats = DCARSacrossNetwork(expr,
                                        samepairs,
                                        W = W,
                                        weightedConcordanceFunction = weightedMeanMatrixStats,
                                        extractPermutationTestStatistics = TRUE,
                                        niter = 100,
                                        verbose = TRUE)
  } else {
    split_df = split.data.frame(samepairs, rep(1:ncores, length.out = nrow(samepairs)))
    names(split_df) <- NULL
    n100_permstats = mclapply(split_df, function(s) {
      res = DCARSacrossNetwork(expr,
                               s,
                               W = W,
                               weightedConcordanceFunction = weightedMeanMatrixStats,
                               extractPermutationTestStatistics = TRUE,
                               niter = 100,
                               verbose = TRUE)
      res = lapply(res, unlist)
      names(res) <- rownames(s)
      return(res)
    })
    n100_permstats = unlist(n100_permstats, recursive = FALSE)
    n100_permstats <- n100_permstats[rownames(samepairs)]
  }
  saveRDS(n100_permstats, file = "output/DE_n100_permstats.Rds")
} else {
  n100_permstats = readRDS("output/DE_n100_permstats.Rds")
}

n100_pval = sapply(names(DE_stats_all), function(gene){
  mean(unlist(n100_permstats[[gene]]) >= DE_stats_all[gene])
})

# recalculate p-value for those with pval < 0.1 with 1000 iter
# aim is to remove these genes at p-value < 0.05 level

if (!file.exists("output/DE_n1000_pval.Rds")) {
  set.seed(500)
  if (!parallel) {
    n1000_pval = DCARSacrossNetwork(expr,
                                   samepairs[n100_pval < 0.1,],
                                   W = W,
                                   weightedConcordanceFunction = weightedMeanMatrixStats,
                                   niter = 1000,
                                   verbose = TRUE)
  } else {
    split_df = split.data.frame(samepairs[n100_pval < 0.1,], rep(1:ncores, length.out = nrow(samepairs[n100_pval < 0.1,])))
    names(split_df) <- NULL
    n1000_pval = mclapply(split_df, function(s) {
      res = DCARSacrossNetwork(expr,
                               s,
                               W = W,
                               weightedConcordanceFunction = weightedMeanMatrixStats,
                               niter = 1000,
                               verbose = TRUE)
      res = lapply(res, unlist)
      names(res) <- rownames(s)
      return(res)
    })
    n1000_pval = unlist(n1000_pval)
    n1000_pval <- n1000_pval[rownames(samepairs[n100_pval<0.1,])]
  } 
  saveRDS(n1000_pval, file = "output/DE_n1000_pval.Rds")
} else {
  n1000_pval = readRDS("output/DE_n1000_pval.Rds")
}
 
DE_pval = n100_pval
DE_pval[names(n1000_pval)] <- n1000_pval

DE_fdr = p.adjust(DE_pval, method = "BH")

nonDEgenes_fdr = names(which(DE_fdr > 0.05))
saveRDS(nonDEgenes_fdr, file = "output/nonDEgenes_fdr.Rds")
nonDEgenes = names(which(DE_pval > 0.05))
saveRDS(nonDEgenes, file = "output/nonDEgenes.Rds")

Perform scHOT spatial correlation testing

pairs = t(combn(intersect(HVG, nonDEgenes),2))
rownames(pairs) <- apply(pairs,1,paste0,collapse = "_")

dim(pairs)
## [1] 903   2
if (!file.exists("output/sampled_permstats.Rdata")) {
  set.seed(500)
  # Does globalCor relate to the null distribution for genes?
  globalCor = apply(pairs, 1, function(x) cor(counts[x[1],], counts[x[2],], method = "spearman"))
  pairs_sampled = pairs[DCARS::stratifiedSample(globalCor, length = 200),]
  
  if (!parallel) {
    sampled_permstats = DCARSacrossNetwork(counts,
                                           pairs_sampled,
                                           W = W,
                                           weightedConcordanceFunction = weightedSpearman,
                                           extractPermutationTestStatistics = TRUE,
                                           niter = 1000,
                                           verbose = TRUE)
  } else {
    split_df = split.data.frame(pairs_sampled, rep(1:ncores, length.out = nrow(pairs_sampled)))
    names(split_df) <- NULL
    sampled_permstats = mclapply(split_df, function(s) {
      res = DCARSacrossNetwork(counts,
                               s,
                               W = W,
                               weightedConcordanceFunction = weightedSpearman,
                               extractPermutationTestStatistics = TRUE,
                               niter = 1000,
                               verbose = TRUE)
      res = lapply(res, unlist)
      names(res) <- rownames(s)
      return(res)
    })
    sampled_permstats = unlist(sampled_permstats, recursive = FALSE)
    sampled_permstats <- sampled_permstats[rownames(pairs_sampled)]
  }
  save(sampled_permstats, pairs_sampled, globalCor, file = "output/sampled_permstats.Rdata")
} else {
  load("output/sampled_permstats.Rdata")
}

permstatsDF = data.frame(
  genepair = rep(names(sampled_permstats), times = unlist(lapply(sampled_permstats, function(x) length(unlist(x))))), 
  stat = unlist(sampled_permstats)
)
permstatsDF$globalCor = globalCor[as.character(permstatsDF$genepair)]

df_99 = data.frame(
  genepair = names(sampled_permstats),
  globalCor = globalCor[names(sampled_permstats)],
  stat_999 = unlist(lapply(sampled_permstats, function(x) quantile(unlist(x), 0.999))),
  stat_99 = unlist(lapply(sampled_permstats, function(x) quantile(unlist(x), 0.99))),
  stat_95 = unlist(lapply(sampled_permstats, function(x) quantile(unlist(x), 0.95))),
  stat_90 = unlist(lapply(sampled_permstats, function(x) quantile(unlist(x), 0.90)))
)
df_99$fitted_999 = loess(stat_999 ~ globalCor, data = df_99)$fitted
df_99$fitted_99 = loess(stat_99 ~ globalCor, data = df_99)$fitted
df_99$fitted_95 = loess(stat_95 ~ globalCor, data = df_99)$fitted
df_99$fitted_90 = loess(stat_90 ~ globalCor, data = df_99)$fitted

g = ggplot(permstatsDF, aes(x = globalCor, y = stat)) + 
  theme_classic() +
  geom_scattermore() +
  geom_point(aes(y = stat_999, colour = "0.999"), data = df_99) + 
  geom_line(aes(y = fitted_999, colour = "0.999"), data = df_99) +
  geom_point(aes(y = stat_99, colour = "0.99"), data = df_99) + 
  geom_line(aes(y = fitted_99, colour = "0.99"), data = df_99) +
  geom_point(aes(y = stat_95, colour = "0.95"), data = df_99) + 
  geom_line(aes(y = fitted_95, colour = "0.95"), data = df_99) +
  geom_point(aes(y = stat_90, colour = "0.90"), data = df_99) + 
  geom_line(aes(y = fitted_90, colour = "0.90"), data = df_99) + 
  scale_colour_manual(values = c("0.999" = tol12qualitative[4],
                                 "0.99" = tol12qualitative[3],
                                 "0.95" = tol12qualitative[2],
                                 "0.90" = tol12qualitative[1])) +
  labs(colour = "Quantile") +
  theme(panel.grid = element_blank()) +
  xlab("Global Correlation") +
  ylab("Permuted test statistics") +
  theme(legend.position = "bottom")
g

ggsave(g, file = "output/stats_globalCor_2d.pdf", height = 8, width = 8)


if (!file.exists("output/2D_p_all.Rdata")) {
  # estimate the set of p-values
  stats_all = DCARSacrossNetwork(counts,
                                 pairs,
                                 W = W,
                                 weightedConcordanceFunction = weightedSpearman,
                                 extractTestStatisticOnly = TRUE,
                                 niter = 1000,
                                 verbose = TRUE)
  
  wcors_all = t(DCARSacrossNetwork(counts,
                                   pairs,
                                   W = W,
                                   weightedConcordanceFunction = weightedSpearman,
                                   extractWcorSequenceOnly = TRUE,
                                   niter = 1000,
                                   verbose = TRUE))
  
  p_all = estimatePvaluesSpearman(stats_all, 
                                  globalCor, 
                                  sampled_permstats,
                                  usenperm = TRUE,
                                  nperm = 10000,
                                  plot = FALSE,
                                  maxDist = 2,
                                  verbose = TRUE)
  p_all$fdr <- p.adjust(p_all$pval, method = "BH")
  
  save(p_all, stats_all, wcors_all, file = "output/2D_p_all.Rdata")
} else {
  load("output/2D_p_all.Rdata")
}

p_all$gene1 = unlist(lapply(strsplit(as.character(p_all$genepair), "_"), "[", 1))
p_all$gene2 = unlist(lapply(strsplit(as.character(p_all$genepair), "_"), "[", 2))

# differentially expressed genes to remove
p_all$spatiallyDE = ifelse(as.character(p_all$gene1) %in% nonDEgenes & as.character(p_all$gene2) %in% nonDEgenes,"No", "DE")

dim(subset(p_all, spatiallyDE == "No"))
## [1] 903  11
nonDEfdr = p_all$pval
nonDEfdr[p_all$spatiallyDE == "DE"] <- NA
nonDEfdr = p.adjust(nonDEfdr, method = "BH")
nonDEfdr[is.na(nonDEfdr)] <- 1
p_all$nonDEfdr = nonDEfdr


getDF = function(gene1, gene2 = NULL) {
  if (length(gene1) > 1) {
    gene2 = gene1[2]
    gene1 = gene1[1]
  }
  
  gpair = paste0(gene1,"_",gene2)
  if (!gpair %in% rownames(wcors_all)) {
    gpair = paste0(gene2,"_",gene1)
  }
  
  wcor = wcors_all[gpair,]
  
  df_res = data.frame(x = coords[,"x"], 
                      y = coords[,"y"], 
                      g1 = expr[gene1,], 
                      g2 = expr[gene2,],
                      wcor = wcor, 
                      W_min = W[which.min(wcor),], 
                      W_max = W[which.max(wcor),])
  return(df_res)
}

basicplotFunction = function(gene1, gene2 = NULL) {
  
  require(ggforce)
  require(patchwork)
  require(ggpubr)
  
  if (length(gene1) > 1) {
    gene2 = gene1[2]
    gene1 = gene1[1]
  }
  
  df_res = getDF(gene1, gene2)
  
  t = theme(legend.key.width = unit(0.5, "inches")) +
    theme(plot.title = element_text(size = 20)) +
    theme(axis.title = element_text(size = 15))
  
  g_gene1 = ggplot(df_res, aes(x = -x, y = y)) + 
    geom_point(aes(colour = g1), size = 5) +
    theme_minimal() +
    theme(panel.grid = element_blank()) +
    theme(axis.text = element_blank()) +
    xlab("") +
    ylab("") +
    ggtitle(gene1) +
    labs(colour = "") +
    theme(legend.position = "bottom") +
    theme(plot.title = element_text(hjust = 0.5, face = "italic")) +
    
    scale_color_viridis_c(breaks = c(0,max(df_res$g1)),
                          limits = c(0,max(df_res$g1)),
                          labels = c("Low","High")) +
    
    t +
    coord_fixed() +
    guides(colour = guide_colourbar(title.position = "top",
                                    title.hjust = 0.5)) +
    theme(legend.title=element_text(size=15)) +
    labs(colour = "Expression") +
    NULL
  
  g_gene2 = ggplot(df_res, aes(x = -x, y = y)) + 
    geom_point(aes(colour = g2), size = 5) +
    theme_minimal() +
    theme(panel.grid = element_blank()) +
    theme(axis.text = element_blank()) +
    xlab("") +
    ylab("") +
    ggtitle(gene2) +
    theme(plot.title = element_text(hjust = 0.5, face = "italic")) +
    labs(colour = "") +
    theme(legend.position = "bottom") +
    scale_color_viridis_c(breaks = c(0,max(df_res$g1)),
                          limits = c(0,max(df_res$g1)),
                          labels = c("Low","High")) +
    t +
    coord_fixed() +
    theme(legend.position = "none") +
    NULL
  
  g2 = ggplot(df_res, aes(x = -x, y = y, fill = wcor)) + 
    geom_voronoi_tile(max.radius = 1) +
    theme_minimal() + 
    theme(panel.grid = element_blank()) +
    theme(axis.text = element_blank()) +
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(legend.position = "bottom") +
    labs(colour = "",fill = "") +
    labs(colour = "Local correlation",fill = "Local correlation") +
    ylab("") +
    xlab("") +
    ggtitle("Correlation of both genes") +
    scale_alpha_continuous(range = c(0,0.5)) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(-1,1)) + 
    t +
    coord_fixed() +
    guides(fill = guide_colourbar(title.position = "top",
                                  title.hjust = 0.5)) +
    theme(legend.title=element_text(size=15)) +
    NULL

  g_gene1_leg = as_ggplot(get_legend(g_gene1))
  g2_leg = as_ggplot(get_legend(g2))
  
  scater::multiplot(g_gene1 + theme(legend.position = "none") + 
                      theme(plot.margin = margin(10,0,-10,0)),
                    g_gene2 + 
                      theme(plot.margin = margin(10,0,-10,0)),
                    g2 + theme(legend.position = "none") + 
                      theme(plot.margin = margin(10,0,-10,0)),
                    g_gene1_leg,
                    g2_leg,
                    layout = matrix(
                      c(1,1,2,2,3,3,
                        1,1,2,2,3,3,
                        1,1,2,2,3,3,
                        6,4,4,6,5,5), ncol = 6, byrow = TRUE))
}

plotFunction = function(gene1, gene2 = NULL) {
  
  require(ggforce)
  require(patchwork)
  
  if (length(gene1) > 1) {
    gene2 = gene1[2]
    gene1 = gene1[1]
  }
  
  df_res = getDF(gene1, gene2)
  
  g_W_min = ggplot(df_res, aes(x = g1, y = g2)) + 
    geom_point(aes(alpha = W_min, size = W_min), colour = "purple") +
    theme_minimal() +
    xlab(gene1) +
    ylab(gene2) +
    ggtitle("Min") +
    NULL
  g_W_max = ggplot(df_res, aes(x = g1, y = g2)) + 
    geom_point(aes(alpha = W_max, size = W_max), colour = "orange") +
    theme_minimal() +
    xlab(gene1) +
    ylab(gene2) +
    ggtitle("Max") +
    NULL
  
  g_xy = ggplot(df_res, aes(x = x, y = y)) + 
    geom_point(size = 0.1) +
    geom_density_2d(data = subset(df_res, W_max != 0), colour = "orange") +
    geom_density_2d(data = subset(df_res, W_min != 0), colour = "purple") +
    theme_minimal() +
    xlab("x coordinate") +
    ylab("y coordinate") +
    ggtitle("Positions") +
    NULL
  
  g_gene1 = ggplot(df_res, aes(x = x, y = y)) + 
    geom_point(aes(colour = g1), size = 5) +
    
    theme_minimal() +
    ggtitle(gene1) +
    scale_color_gradient2(low = "black", mid = "yellow", high = "red", midpoint = 2) +
    NULL
  
  g_gene2 = ggplot(df_res, aes(x = x, y = y)) + 
    geom_point(aes(colour = g2), size = 5) +
    
    theme_minimal() +
    ggtitle(gene2) +
    scale_color_gradient2(low = "black", mid = "yellow", high = "red", midpoint = 2) +
    NULL
  
  g2 = ggplot(df_res, aes(x = x, y = y, fill = wcor)) + 
    geom_voronoi_tile(max.radius = 1) +
    geom_point(size = 1, colour = "black") +
    theme_minimal() + 
    scale_alpha_continuous(range = c(0,0.5)) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(-1,1)) + 
    NULL
  
  
  return(g_gene1 + g_gene2 + g2 + 
           g_W_min + g_W_max + g_xy + plot_layout(ncol = 3, nrow = 2, byrow = TRUE))
}

FDR_level = 0.2

wcorsSig = wcors_all[p_all$nonDEfdr < FDR_level,]
pairsSig = pairs[p_all$nonDEfdr < FDR_level,]

dim(wcorsSig)
## [1] 167 262
# basic plot functions of sig pairs
apply(pairsSig,1, function(p) {
  #print(p)
  if (!file.exists("output/basicplots")) {
    system("mkdir output/basicplots")
  }
  pdf(paste0("output/basicplots/", p[1], "_", p[2], ".pdf"),
      height = 4.5, width = 12,onefile=FALSE)
  basicplotFunction(p)
  dev.off()
})
##       Abat_Arrb1        Abat_Cst3        Abat_Mlc1       Abat_Myo10 
##                2                2                2                2 
##      Abat_Slc1a3       Abat_Tiam1     Actb_Gucy1b3       Actb_Itm2c 
##                2                2                2                2 
##       Actb_Myo10      Actb_Smim13         Actb_Sp8       Actb_Tiam1 
##                2                2                2                2 
##       Actb_Vdac3        Arrb1_Cnp       Arrb1_Cst3       Arrb1_Dnm3 
##                2                2                2                2 
##       Arrb1_Mtor     Arrb1_Prune2     Arrb1_Smim13        Arrb1_Sp8 
##                2                2                2                2 
##     Arrb1_Tmem47      Arrb1_Uchl1   Atp6v1c1_Calm1    Atp6v1c1_Cst3 
##                2                2                2                2 
##   Atp6v1c1_Luzp2    Atp6v1c1_Mlc1   Atp6v1c1_Myo10 Atp6v1c1_Plekha1 
##                2                2                2                2 
##     Atp6v1c1_Sp8   Atp6v1c1_Tiam1  Atp6v1c1_Tmsb4x     Atrnl1_Ccng1 
##                2                2                2                2 
##   Atrnl1_Gm15421     Atrnl1_Myo10       Bai1_Ccng1        Bai1_Cst3 
##                2                2                2                2 
##     Bai1_Gucy1b3      Bai1_Slc1a3       Bai1_Tiam1       Calm1_Cst3 
##                2                2                2                2 
##      Calm1_Dnm1l    Calm1_Gucy1b3    Calm1_Plekha1     Calm1_Smim13 
##                2                2                2                2 
##        Calm1_Sp8      Calm1_Tiam1      Calm1_Vdac3      Calm1_Wdfy3 
##                2                2                2                2 
##     Ccng1_Cldn11    Ccng1_Erbb2ip       Ccng1_Scd2     Ccng1_Smim13 
##                2                2                2                2 
##       Cldn11_Cnp      Cldn11_Cst3   Cldn11_Gucy1b3     Cldn11_Ildr2 
##                2                2                2                2 
##     Cldn11_Itm2c      Cldn11_Scd2    Cldn11_Tuba1a     Cldn11_Vdac3 
##                2                2                2                2 
##        Cnp_Itm2c       Cnp_Prune2       Cnp_Tmsb4x     Cst3_Gucy1b3 
##                2                2                2                2 
##      Cst3_Impact       Cst3_Itm2c       Cst3_Luzp2        Cst3_Mtor 
##                2                2                2                2 
##       Cst3_Myo10      Cst3_Prrc2b      Cst3_Prune2      Cst3_Smim13 
##                2                2                2                2 
##         Cst3_Sp8      Cst3_Tuba1a       Cst3_Uchl1       Cst3_Wdfy3 
##                2                2                2                2 
##       Dnm1l_Dnm3     Dnm1l_Fam63b    Dnm1l_Plekha1        Dnm1l_Sp8 
##                2                2                2                2 
##      Dnm1l_Vdac3      Dnm3_Fam63b       Dnm3_Tiam1    Gm15421_Ildr2 
##                2                2                2                2 
##    Gm15421_Itm2c   Gm15421_Tmsb4x    Gucy1b3_Ildr2     Gucy1b3_Mlc1 
##                2                2                2                2 
##    Gucy1b3_Myo10  Gucy1b3_Plekha1   Gucy1b3_Smim13    Gucy1b3_Tiam1 
##                2                2                2                2 
##   Gucy1b3_Tmem47   Gucy1b3_Tmsb4x   Gucy1b3_Tuba1a    Gucy1b3_Uchl1 
##                2                2                2                2 
##        Igf2_Mtor        Igf2_Pmm1      Ildr2_Itm2c       Ildr2_Mlc1 
##                2                2                2                2 
##    Ildr2_Plekha1      Ildr2_Tiam1     Ildr2_Tuba1a      Impact_Mlc1 
##                2                2                2                2 
##     Impact_Myo10     Impact_Ntng1   Impact_Plekha1     Impact_Tiam1 
##                2                2                2                2 
##    Impact_Tmem47      Itm2c_Myo10       Itm2c_Pmm1     Itm2c_Prrc2b 
##                2                2                2                2 
##     Itm2c_Smim13        Itm2c_Sp8      Itm2c_Tiam1     Itm2c_Tuba1a 
##                2                2                2                2 
##      Itm2c_Uchl1      Itm2c_Vdac3      Luzp2_Uchl1        Mlc1_Mtor 
##                2                2                2                2 
##       Mlc1_Myo10      Mlc1_Prune2      Mlc1_Slc1a3       Mlc1_Uchl1 
##                2                2                2                2 
##       Mtor_Myo10      Mtor_Slc1a3      Mtor_Tmem47       Mtor_Vdac3 
##                2                2                2                2 
##       Mtor_Wdfy3     Myo10_Prune2       Myo10_Scd2     Myo10_Slc1a3 
##                2                2                2                2 
##        Myo10_Sp8      Myo10_Tiam1      Myo10_Uchl1      Ntng1_Tiam1 
##                2                2                2                2 
##       Ntng1_Utrn    Plekha1_Tiam1   Plekha1_Tmsb4x    Plekha1_Vdac3 
##                2                2                2                2 
##       Prrc2b_Sp8     Prrc2b_Tiam1     Prrc2b_Vdac3     Prune2_Tiam1 
##                2                2                2                2 
##    Prune2_Tuba1a     Prune2_Wdfy3      Scd2_Smim13         Scd2_Sp8 
##                2                2                2                2 
##       Smim13_Sp8    Smim13_Tmem47    Smim13_Tmsb4x    Smim13_Tuba1a 
##                2                2                2                2 
##     Smim13_Vdac3       Sp8_Tmsb4x       Sp8_Tuba1a        Sp8_Uchl1 
##                2                2                2                2 
##        Sp8_Vdac3     Tiam1_Tmsb4x     Tiam1_Tuba1a      Tiam1_Vdac3 
##                2                2                2                2 
##      Tiam1_Wdfy3    Tmsb4x_Tuba1a     Tmsb4x_Vdac3     Tuba1a_Uchl1 
##                2                2                2                2 
##     Tuba1a_Vdac3      Uchl1_Vdac3      Vdac3_Wdfy3 
##                2                2                2

Interrogate significant gene pairs

cellsCut = cutree(hclust(cordist(t(wcorsSig))), 20)
table(cellsCut)
## cellsCut
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
## 15 16 19 10  7 12 18  4  6 27  6  9 36 10 12  8 12  8  7 20
sort(table(cellsCut))
## cellsCut
##  8  9 11  5 19 16 18 12  4 14  6 15 17  1  2  7  3 20 10 13 
##  4  6  6  7  7  8  8  9 10 10 12 12 12 15 16 18 19 20 27 36
df_res = getDF(pairs[1,])

df_res$cellsCut <- factor(cellsCut)
ggplot(df_res, aes(x = x, y = y, fill = cellsCut, colour = cellsCut)) + 
  geom_point(size = 7, shape = 21, stroke = 1.5, colour = "black") +
  theme_minimal() + 
  scale_alpha_continuous(range = c(0,0.5)) +
  NULL

ggsave(file = "output/cellsCluster_spatial.pdf", height = 8, width = 10)

hc = hclust(dist(wcorsSig, method = "maximum"), method = "complete")

genepairsClustDynamic = cutreeDynamic(
  hc, 
  minClusterSize = 10, 
  method = "tree",
  deepSplit = TRUE,
  useMedoids = FALSE
)
kk = length(unique(genepairsClustDynamic))-1
kk
## [1] 9
genepairsClust = cutree(hc, k = kk)
plot(hc)

table(genepairsClust)
## genepairsClust
##  1  2  3  4  5  6  7  8  9 
## 18 35 10 16 19 24 33  8  4
sort(table(genepairsClust))
## genepairsClust
##  9  8  3  4  1  5  6  7  2 
##  4  8 10 16 18 19 24 33 35
p_sig = cbind(p_all[names(genepairsClust),], cluster = genepairsClust)
saveRDS(p_sig, file = "output/p_sig.Rds")

write.table(as.matrix(sort_df(p_sig, c("cluster","genepair"))),
            file = "output/sig_genepairs.tsv", row.names = FALSE,
            col.names = TRUE, quote = FALSE, sep = "\t")

write.table(as.matrix(genepairsClust), file = "output/genepairsClust.tsv", row.names = TRUE,
            col.names = FALSE, quote = FALSE, sep = "\t")

genepairs_split = lapply(split(names(genepairsClust), genepairsClust), function(x) t(do.call(cbind, strsplit(x, "_"))))
genepairs_split_genes = lapply(genepairs_split, function(x) sort(unique(c(x))))

sapply(names(genepairs_split_genes), function(i){
  write(genepairs_split_genes[[i]], file = paste0("output/cluster_", i,".txt"))
})
## $`1`
## NULL
## 
## $`2`
## NULL
## 
## $`3`
## NULL
## 
## $`4`
## NULL
## 
## $`5`
## NULL
## 
## $`6`
## NULL
## 
## $`7`
## NULL
## 
## $`8`
## NULL
## 
## $`9`
## NULL
geneClustMembers = sapply(unique(genepairsClust), function(x) 
  unique(c(pairsSig[genepairsClust == x,])), simplify = FALSE)
names(geneClustMembers) <- paste0("cluster_", unique(genepairsClust))
saveRDS(geneClustMembers, file = "output/geneClustMembers.Rds")

jacDist_pairs = expand.grid(names(geneClustMembers),names(geneClustMembers))
jacDist_vals = apply(jacDist_pairs,1,function(x){
  if (x[1] == x[2]) return(NA)
  set1 = geneClustMembers[[x[1]]]
  set2 = geneClustMembers[[x[2]]]
  return(length(intersect(set1,set2))/length(union(set1,set2)))
})
jacDist = as.matrix(reshape::cast(cbind(jacDist_pairs, jacDist_vals), formula = Var2 ~ Var1, value = "jacDist_vals"))

pdf("output/jacDist_clusters.pdf", height = 8, width = 8)
heatmap.2(jacDist, trace = "n", main = "Jaccard distance of genes within clusters", 
          density.info = "none",
          key.title = "",
          key.xlab = "Jaccard distance",
          symm = TRUE, 
          revC = TRUE)
dev.off()
## quartz_off_screen 
##                 2
meanwcorsSig = apply(wcorsSig, 2, function(x) {
  tapply(x, genepairsClust, mean)
})
rownames(meanwcorsSig) <- paste0("cluster_", 1:length(unique(genepairsClust)))

pdf("output/meanGenes_heatmap.pdf", height = 12, width = 24)
heatmap.2(meanwcorsSig, trace = "n", col = colorRampPalette(c("blue","white","red")),
          key.title = "",
          key.xlab = "Mean weighted correlation",
          main = "Mean weighted correlation of clustered genepairs")
dev.off()
## quartz_off_screen 
##                 2
df_res2 <- getDF(pairs[1,])
df_res2 <- cbind(df_res2, t(meanwcorsSig))

gList_cells <- sapply(rownames(meanwcorsSig), function(name) {
  ggplot(df_res2, aes(x = -x, y = y, fill = get(name))) + 
    geom_voronoi_tile(max.radius = 1) +
    geom_point(size = 0.5, colour = "black") +
    theme_minimal() + 
    scale_alpha_continuous(range = c(0,0.5)) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = c(-1,1)) + 
    ggtitle("") +
    theme(legend.position = "none") +
    theme(panel.grid = element_blank()) +
    theme(axis.ticks = element_blank()) +
    theme(axis.text = element_blank()) +
    xlab("") + ylab("") +
    theme(plot.title = element_text(hjust = 0.5)) +
    coord_fixed() +
    NULL
}, simplify = FALSE)

gAll = patchwork::wrap_plots(gList_cells, ncol = length(gList_cells)/5, nrow = 5)
gAll

ggsave(gAll, file = "output/genepairsClust_wcors.pdf", height = 5, width = 44)

hc_mean_cells = hclust(dist(t(wcorsSig), method = "euclidean"), method = "complete")

hc_mean_cells_groups = cutreeDynamic(
  hc_mean_cells, 
  minClusterSize = 10, 
  method = "tree",
  deepSplit = FALSE,
  useMedoids = FALSE
)

kk_cells = length(unique(hc_mean_cells_groups))-1
kk_cells
## [1] 8
hc_mean_cells_groups = cutree(hc_mean_cells, kk_cells)
plot(hc_mean_cells)

plot(hc_mean_cells, labels = hc_mean_cells_groups)

hc_mean_cells_fac = factor(hc_mean_cells_groups, levels = unique(hc_mean_cells_groups[hc_mean_cells$order]))
df_hc_mean_cells = data.frame(
  x = coords[,1],
  y = coords[,2],
  hc_mean_cells = hc_mean_cells_fac
)
s = 1
g_cells = ggplot(df_hc_mean_cells, aes(x = -x, y = y)) + 
  geom_point(data = df_hc_mean_cells[,1:2], alpha = 0.2, stroke = 0, size = s, pch = 16) + 
  geom_point(stroke = 0, size = s, pch = 16) +
  facet_grid(hc_mean_cells~.) + 
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.text = element_blank()) +
  xlab("") +
  ylab("") +
  coord_fixed() +
  NULL
g_cells

ggsave(g_cells, file = "output/split_heatmap_cells.pdf",height = 7, width = 1.5)

hc_mean_genepairs = hc
hc_mean_genepairs_groups = genepairsClust

plot(hc_mean_genepairs)

plot(hc_mean_genepairs, labels = hc_mean_genepairs_groups)

hc_mean_genepairs_fac = factor(hc_mean_genepairs_groups, levels = unique(hc_mean_genepairs_groups[hc_mean_genepairs$order]))

split(names(hc_mean_genepairs_fac), hc_mean_genepairs_fac)
## $`5`
##  [1] "Actb_Gucy1b3"   "Bai1_Gucy1b3"   "Calm1_Gucy1b3"  "Calm1_Vdac3"   
##  [5] "Cldn11_Gucy1b3" "Cldn11_Tuba1a"  "Cldn11_Vdac3"   "Cnp_Prune2"    
##  [9] "Cst3_Gucy1b3"   "Dnm1l_Sp8"      "Gucy1b3_Mlc1"   "Gucy1b3_Tuba1a"
## [13] "Impact_Tiam1"   "Itm2c_Pmm1"     "Itm2c_Prrc2b"   "Mlc1_Prune2"   
## [17] "Prune2_Tiam1"   "Sp8_Uchl1"      "Tuba1a_Vdac3"  
## 
## $`7`
##  [1] "Actb_Tiam1"      "Actb_Vdac3"      "Atp6v1c1_Calm1"  "Atp6v1c1_Tmsb4x"
##  [5] "Calm1_Plekha1"   "Calm1_Wdfy3"     "Cldn11_Cnp"      "Cldn11_Scd2"    
##  [9] "Cnp_Itm2c"       "Cnp_Tmsb4x"      "Cst3_Luzp2"      "Dnm1l_Plekha1"  
## [13] "Dnm1l_Vdac3"     "Gm15421_Itm2c"   "Gm15421_Tmsb4x"  "Gucy1b3_Ildr2"  
## [17] "Gucy1b3_Tiam1"   "Gucy1b3_Tmsb4x"  "Ildr2_Itm2c"     "Ildr2_Tiam1"    
## [21] "Ildr2_Tuba1a"    "Itm2c_Tiam1"     "Itm2c_Vdac3"     "Mlc1_Slc1a3"    
## [25] "Mtor_Wdfy3"      "Plekha1_Tmsb4x"  "Plekha1_Vdac3"   "Prrc2b_Vdac3"   
## [29] "Smim13_Vdac3"    "Tiam1_Tmsb4x"    "Tmsb4x_Tuba1a"   "Tmsb4x_Vdac3"   
## [33] "Vdac3_Wdfy3"    
## 
## $`6`
##  [1] "Actb_Itm2c"     "Actb_Smim13"    "Actb_Sp8"       "Atp6v1c1_Cst3" 
##  [5] "Atp6v1c1_Luzp2" "Atp6v1c1_Mlc1"  "Atp6v1c1_Sp8"   "Calm1_Dnm1l"   
##  [9] "Calm1_Smim13"   "Calm1_Sp8"      "Cst3_Smim13"    "Gucy1b3_Smim13"
## [13] "Itm2c_Smim13"   "Itm2c_Sp8"      "Itm2c_Tuba1a"   "Prrc2b_Sp8"    
## [17] "Prune2_Tuba1a"  "Scd2_Smim13"    "Scd2_Sp8"       "Smim13_Tmsb4x" 
## [21] "Smim13_Tuba1a"  "Sp8_Tmsb4x"     "Sp8_Tuba1a"     "Sp8_Vdac3"     
## 
## $`4`
##  [1] "Abat_Tiam1"       "Atp6v1c1_Plekha1" "Atp6v1c1_Tiam1"   "Atrnl1_Ccng1"    
##  [5] "Bai1_Tiam1"       "Calm1_Tiam1"      "Cldn11_Ildr2"     "Gucy1b3_Plekha1" 
##  [9] "Igf2_Pmm1"        "Ildr2_Plekha1"    "Impact_Ntng1"     "Impact_Plekha1"  
## [13] "Plekha1_Tiam1"    "Prrc2b_Tiam1"     "Tiam1_Tuba1a"     "Tiam1_Vdac3"     
## 
## $`1`
##  [1] "Abat_Arrb1"     "Actb_Myo10"     "Arrb1_Cnp"      "Arrb1_Dnm3"    
##  [5] "Arrb1_Mtor"     "Arrb1_Prune2"   "Arrb1_Smim13"   "Arrb1_Sp8"     
##  [9] "Arrb1_Tmem47"   "Arrb1_Uchl1"    "Atp6v1c1_Myo10" "Atrnl1_Myo10"  
## [13] "Cst3_Myo10"     "Dnm1l_Fam63b"   "Gucy1b3_Tmem47" "Luzp2_Uchl1"   
## [17] "Mlc1_Mtor"      "Myo10_Uchl1"   
## 
## $`2`
##  [1] "Abat_Cst3"     "Abat_Mlc1"     "Abat_Slc1a3"   "Arrb1_Cst3"   
##  [5] "Bai1_Cst3"     "Bai1_Slc1a3"   "Calm1_Cst3"    "Cldn11_Cst3"  
##  [9] "Cldn11_Itm2c"  "Cst3_Impact"   "Cst3_Itm2c"    "Cst3_Mtor"    
## [13] "Cst3_Prrc2b"   "Cst3_Prune2"   "Cst3_Sp8"      "Cst3_Tuba1a"  
## [17] "Cst3_Uchl1"    "Cst3_Wdfy3"    "Dnm1l_Dnm3"    "Gucy1b3_Uchl1"
## [21] "Ildr2_Mlc1"    "Impact_Mlc1"   "Impact_Tmem47" "Itm2c_Uchl1"  
## [25] "Mlc1_Uchl1"    "Mtor_Slc1a3"   "Mtor_Tmem47"   "Mtor_Vdac3"   
## [29] "Myo10_Slc1a3"  "Ntng1_Utrn"    "Prune2_Wdfy3"  "Smim13_Sp8"   
## [33] "Smim13_Tmem47" "Tuba1a_Uchl1"  "Uchl1_Vdac3"  
## 
## $`8`
## [1] "Bai1_Ccng1"    "Ccng1_Smim13"  "Dnm3_Fam63b"   "Gucy1b3_Myo10"
## [5] "Impact_Myo10"  "Myo10_Sp8"     "Myo10_Tiam1"   "Ntng1_Tiam1"  
## 
## $`9`
## [1] "Ccng1_Cldn11"  "Ccng1_Erbb2ip" "Ccng1_Scd2"    "Igf2_Mtor"    
## 
## $`3`
##  [1] "Abat_Myo10"     "Atrnl1_Gm15421" "Dnm3_Tiam1"     "Gm15421_Ildr2" 
##  [5] "Itm2c_Myo10"    "Mlc1_Myo10"     "Mtor_Myo10"     "Myo10_Prune2"  
##  [9] "Myo10_Scd2"     "Tiam1_Wdfy3"
write.table(sort_df(data.frame(grouping = hc_mean_genepairs_fac, cluster = names(hc_mean_genepairs_fac)), "grouping"), file = "output/hc_mean_genepairs_fac_split.txt", col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")

hc_mean_meanwcorsSig = apply(wcorsSig, 2, function(x)
  tapply(x, hc_mean_genepairs_fac, mean)
)

df_hc_mean_genepairs = data.frame(
  x = rep(coords[,1],times = nrow(hc_mean_meanwcorsSig)),
  y = rep(coords[,2],times = nrow(hc_mean_meanwcorsSig)),
  hc_mean_genepairs = factor(rep(levels(hc_mean_genepairs_fac), each = ncol(meanwcorsSig)),
                             levels = levels(hc_mean_genepairs_fac)),
  hc_mean_genepairs_wc = c(t(hc_mean_meanwcorsSig))
)

g_mean = ggplot(df_hc_mean_genepairs, aes(x = -x, y = y, colour = hc_mean_genepairs_wc)) + 
  geom_point(size = 0.7) +
  facet_grid(~hc_mean_genepairs) + 
  theme_minimal() +
  theme(panel.grid = element_blank(),
        axis.text = element_blank()) +
  scale_colour_gradient2(low = "blue", mid = "white", high = "red", limits = c(-1,1)) + 
  theme(legend.position = "none") +
  xlab("") +
  ylab("") +
  coord_fixed() +
  NULL
g_mean

cowplot::ggsave2(g_mean, file = "output/split_heatmap_genepairs.pdf", height = 1.3, width = 9)

pdf("output/split_heatmap.pdf",height = 10, width = 15)
tmat = t(wcorsSig)
colnames(tmat) <- gsub("cluster_","", colnames(tmat))
h = ComplexHeatmap::Heatmap(tmat, 
                            cluster_columns = hc_mean_genepairs,
                            cluster_rows = hc_mean_cells,
                            row_dend_reorder = FALSE,
                            column_dend_reorder = FALSE,
                            row_split = kk_cells,
                            column_split = kk,
                            show_heatmap_legend = FALSE,
                            show_column_names = FALSE
)
print(h)
dev.off()
## quartz_off_screen 
##                 2
grob = grid.grabExpr(draw(h))

tmatmean = t(apply(tmat, 1, function(x) tapply(x, hc_mean_genepairs_fac, mean)))

hh = ComplexHeatmap::Heatmap(tmatmean, cluster_columns = FALSE,
                             cluster_rows = hc_mean_cells,
                             row_dend_reorder = FALSE,
                             column_dend_reorder = FALSE,
                             row_split = kk_cells,
                             show_heatmap_legend = FALSE,
                             show_column_names = FALSE,
                             column_title = "Gene pair clusters",
                             row_title = "Cells",
                             row_title_gp = gpar(fontsize = 20),
                             column_title_gp = gpar(fontsize = 20),
                             border = "black"
)
hh

grobh = grid.grabExpr(draw(hh))

pdf("output/split_heatmap_combined.pdf",height = 7, width = 11, useDingbats = FALSE)
cowplot::plot_grid(grob, 
                   g_cells + theme(plot.margin = margin(2,0.5,0.2,-0.5,unit = "cm")) +
                     theme(strip.text = element_blank()), 
                   g_mean + theme(plot.margin = margin(0,0,0,1.5, unit = "cm")) +
                     theme(strip.text = element_blank()) +
                     NULL
                   , 
                   ncol = 2,
                   rel_heights = c(7,1),
                   rel_widths = c(9,1))
dev.off()
## quartz_off_screen 
##                 2
pdf("output/split_heatmap_combined_summarised.pdf",height = 8, width = 9, useDingbats = FALSE)
cowplot::plot_grid(grobh, 
                   g_cells + 
                     theme(plot.margin = margin(1.3,0.5,-0.2,-0.5,unit = "cm")) +
                     theme(strip.text = element_blank()), 
                   g_mean + 
                     theme(plot.margin = margin(0,0,0,1.5, unit = "cm")) +
                     theme(strip.text = element_blank()), 
                   ncol = 2,
                   rel_heights = c(8,1),
                   rel_widths = c(8,1))

dev.off()
## quartz_off_screen 
##                 2

Gene ontology enrichment testing

if (!file.exists("output/GO_list.Rds")) {
  library(GO.db)
  library(org.Mm.eg.db)
  keys = keys(org.Mm.eg.db)
  columns(org.Mm.eg.db)
  GO_info = select(org.Mm.eg.db, keys=keys, columns = c("SYMBOL", "GO"))
  
  keep = GO_info$SYMBOL %in% rownames(counts)
  table(keep)
  GO_info_filt = GO_info[keep,]
  
  # at least 10 genes in the term and less than 500
  # allTerms = names(which(table(GO_info_filt$GO) >= 10 & table(GO_info_filt$GO) <= 500))
  sizeTerms = names(which(table(GO_info_filt$GO) >= 10 & table(GO_info_filt$GO) <= 500))
  
  # also have at least one of the genes which we actually test
  testTerms = subset(GO_info_filt, SYMBOL %in% intersect(HVG, nonDEgenes))$GO
  
  allTerms = intersect(sizeTerms, testTerms)
  
  GO_info_terms = select(GO.db, columns = columns(GO.db), keys = allTerms)
  rownames(GO_info_terms) <- GO_info_terms$GOID
  
  allTermNames = GO_info_terms[allTerms, "TERM"]
  names(allTermNames) <- allTerms
  
  GO_list = sapply(allTerms, function(term) {
    print(term)
    genes = GO_info_filt[GO_info_filt$GO == term, "SYMBOL"]
    return(sort(unique(genes[!is.na(genes)])))
  }, simplify = FALSE)
  names(GO_list) <- allTermNames
  
  saveRDS(GO_list, file = "output/GO_list.Rds")
} else {
  GO_list = readRDS("output/GO_list.Rds")
}

if (!file.exists("output/superclusterstotest_GO.Rds")) {
  superclusterstotest_GO = lapply(geneClustMembers, function(set) {
    print("testing..")
    genesetGOtest(set, rownames(counts), GO_list)
  }
  )
  saveRDS(superclusterstotest_GO, file = "output/superclusterstotest_GO.Rds")
} else {
  superclusterstotest_GO <- readRDS("output/superclusterstotest_GO.Rds")
}

gList = lapply(superclusterstotest_GO, function(pval) {
  df = data.frame(term = factor(names(pval), levels = c(names(pval), "")),
                  pval = pval,
                  qval = p.adjust(pval, method = "BH"))
  df$label = df$term
  df$label[pval != 1] <- ""
  df_sorted = sort_df(df, "pval")[1:10,]
  df_sorted$term = factor(df_sorted$term, levels =  rev(df_sorted$term))
  
  g = ggplot(df_sorted, aes(x = term, y = -log10(pval), fill = qval < 0.05)) +
    theme_classic() +
    geom_col() +
    coord_flip() +
    xlab("") +
    ylab("-log10(P-value)") +
    # geom_hline(yintercept = -log10(0.01), colour = "red", linetype = "dashed", size = 1) +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) +
    scale_fill_manual(values = c("TRUE" = "dimgrey", "FALSE" = "peachpuff")) +
    theme(legend.position = "none") +
    NULL
  return(g)
  
})

gTogether = sapply(seq_len(length(gList)), function(i){
  
  g_1 = gList[[i]] + 
    theme(axis.text = element_text(size = 12))
  n = ggtitle(paste0("Cluster ",i)) 
  g_2 = gList_cells[[i]] + theme(title = element_text(size = 20)) + 
    coord_fixed()
  
  pdf(paste0("output/go_cluster_plot_cluster_",i,".pdf"), 
      height = 7, width = 4.5)
  scater::multiplot(g_2 + n, g_1 + ylab(""), layout = matrix(c(1,1,2,2,2), ncol = 1))
  dev.off()
  
  return(g_1 + g_2)
}, simplify = FALSE)

Compare different span choices

Using script run on cluster due to high computational demand. Note that the file span_output.RData is around 60MB.

if (!file.exists("output/span_output.RData")) {
  source("span_MOB.R")
}
load("output/span_output.RData")

span_p_all_all = do.call(rbind, span_p_all)
span_p_all_all$testing <- rep(names(span_p_all), 
                                         times = unlist(lapply(span_p_all, nrow)))

span_pvals = do.call(cbind,lapply(span_p_all, "[", "pval"))
colnames(span_pvals) <- names(span_p_all)

span_cor = cor(-log10(span_pvals), method = "spearman")

g = ggpairs(-log10(span_pvals),
            
            lower = list(continuous = wrap("points", alpha = 0.3, size=0.1), 
                         combo = wrap("dot", alpha = 0.4, size=0.2)),
            title = "-log10(P-value)"
)

g

ggsave(g, file = "output/span_pairs.pdf", height = 12, width = 14)


pdf("output/span_corrplot.pdf", height = 10, width = 10)
g_cor = corrplot(span_cor, order = "original")
dev.off()
## quartz_off_screen 
##                 2
pdf("output/span_upset.pdf", height = 10, width = 16, onefile = FALSE)
upset(data = as.data.frame(as.matrix(1*(apply(span_pvals,2,p.adjust, method = "BH") < 0.2))),
      sets = colnames(span_pvals),
      order.by = "freq",
      text.scale = 2)
dev.off()
## quartz_off_screen 
##                 2
# distribution graphs
span_distn = data.frame(
    quantile_999 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.999)))),
    quantile_90 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.90)))),
    quantile_95 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.95)))),
    quantile_99 = unlist(lapply(span_perms, function(x) unlist(lapply(x, quantile, probs = 0.99)))),
    global = unlist(lapply(span_global, "[", span_sampled_ind)),
    span = rep(names(span_perms), times = unlist(lapply(span_perms, length)))
)
span_distn <- reshape::sort_df(span_distn, "global")

span_distn$quantile_999_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                                           function(df){
                                                               loess(quantile_999 ~ global, data = df)$fitted
                                                           }), span_distn$span)

span_distn$quantile_90_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                                          function(df){
                                                              loess(quantile_90 ~ global, data = df)$fitted
                                                          }), span_distn$span)

span_distn$quantile_95_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                                          function(df){
                                                              loess(quantile_95 ~ global, data = df)$fitted
                                                          }), span_distn$span)

span_distn$quantile_99_fitted = unsplit(lapply(split.data.frame(span_distn, span_distn$span),
                                                          function(df){
                                                              loess(quantile_99 ~ global, data = df)$fitted
                                                          }), span_distn$span)

span_distn$span = factor(span_distn$span,levels = names(span_perms))

span_nicecolours = RColorBrewer::brewer.pal(name = "Blues", 9)
names(span_nicecolours) <- c(rep("", 5), "0.90","0.95","0.99","0.999")
g = ggplot(span_distn, aes(x = global))  + 
    
    geom_point(aes(y = quantile_999, colour = "0.999"),
               alpha = 0.5) +#, colour = span_nicecolours[9]) +
    geom_line(aes(y = quantile_999_fitted), colour = span_nicecolours[9]) +
    
    geom_point(aes(y = quantile_99, colour = "0.99"),
               alpha = 0.5) + #, colour = span_nicecolours[8]) +
    geom_line(aes(y = quantile_99_fitted), colour = span_nicecolours[8]) +
    
    geom_point(aes(y = quantile_95, colour = "0.95"),
               alpha = 0.5) + #, colour = span_nicecolours[7]) +
    geom_line(aes(y = quantile_95_fitted), colour = span_nicecolours[7]) +
    
    geom_point(aes(y = quantile_90, colour = "0.90"),
               alpha = 0.5) + #, colour = span_nicecolours[6]) +
    geom_line(aes(y = quantile_90_fitted), colour = span_nicecolours[6]) +
    facet_wrap(~span, scales = "free", ncol = 4) + 
    theme_classic() + 
    xlab("Global higher order statistic value") +
    ylab("Permuted scHOT test statistic quantile") +
    scale_colour_manual(name="Quantiles",values=span_nicecolours[6:9]) +
    theme(legend.position = "bottom") +
    NULL
g

ggsave(g, file = "output/span_pvalEstimation.pdf",
       height = 12, width = 14)
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin18.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.6_1/lib/libopenblasp-r0.3.6.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0                corrplot_0.84              
##  [3] GGally_1.4.0                cowplot_1.0.0              
##  [5] ggpubr_0.2.5                magrittr_1.5               
##  [7] patchwork_1.0.0             ggforce_0.3.1              
##  [9] stringr_1.4.0               org.Mm.eg.db_3.10.0        
## [11] GO.db_3.10.0                AnnotationDbi_1.48.0       
## [13] ComplexHeatmap_2.2.0        dynamicTreeCut_1.63-1      
## [15] scattermore_0.4             reshape_0.8.8              
## [17] scran_1.14.6                scater_1.14.6              
## [19] SingleCellExperiment_1.8.0  SummarizedExperiment_1.16.1
## [21] DelayedArray_0.12.2         BiocParallel_1.20.1        
## [23] matrixStats_0.55.0          Biobase_2.46.0             
## [25] GenomicRanges_1.38.0        GenomeInfoDb_1.22.0        
## [27] IRanges_2.20.2              S4Vectors_0.24.3           
## [29] BiocGenerics_0.32.0         gplots_3.0.1.2             
## [31] ggplot2_3.2.1               DCARS_0.3.5                
## 
## loaded via a namespace (and not attached):
##  [1] ggbeeswarm_0.6.0         colorspace_1.4-1         deldir_0.1-25           
##  [4] ggsignif_0.6.0           rjson_0.2.20             circlize_0.4.8          
##  [7] XVector_0.26.0           GlobalOptions_0.1.1      BiocNeighbors_1.4.1     
## [10] clue_0.3-57              farver_2.0.3             bit64_0.9-7             
## [13] codetools_0.2-16         knitr_1.28               polyclip_1.10-0         
## [16] cluster_2.1.0            png_0.1-7                compiler_3.6.1          
## [19] dqrng_0.2.1              assertthat_0.2.1         Matrix_1.2-18           
## [22] lazyeval_0.2.2           limma_3.42.2             tweenr_1.0.1            
## [25] BiocSingular_1.2.2       htmltools_0.4.0          tools_3.6.1             
## [28] rsvd_1.0.3               igraph_1.2.4.2           gtable_0.3.0            
## [31] glue_1.3.1               GenomeInfoDbData_1.2.2   reshape2_1.4.3          
## [34] dplyr_0.8.4              Rcpp_1.0.3               vctrs_0.2.2             
## [37] gdata_2.18.0             DelayedMatrixStats_1.8.0 xfun_0.12               
## [40] lifecycle_0.1.0          irlba_2.3.3              gtools_3.8.1            
## [43] statmod_1.4.34           edgeR_3.28.0             zlibbioc_1.32.0         
## [46] MASS_7.3-51.5            scales_1.1.0             RColorBrewer_1.1-2      
## [49] yaml_2.2.1               memoise_1.1.0            gridExtra_2.3           
## [52] stringi_1.4.6            RSQLite_2.2.0            caTools_1.18.0          
## [55] shape_1.4.4              rlang_0.4.4              pkgconfig_2.0.3         
## [58] bitops_1.0-6             evaluate_0.14            lattice_0.20-40         
## [61] purrr_0.3.3              labeling_0.3             bit_1.1-15.2            
## [64] tidyselect_1.0.0         plyr_1.8.5               R6_2.4.1                
## [67] DBI_1.1.0                pillar_1.4.3             withr_2.1.2             
## [70] RCurl_1.98-1.1           tibble_2.1.3             crayon_1.3.4            
## [73] KernSmooth_2.23-16       rmarkdown_2.1            viridis_0.5.1           
## [76] GetoptLong_0.1.8         locfit_1.5-9.1           blob_1.2.1              
## [79] digest_0.6.24            munsell_0.5.0            beeswarm_0.2.3          
## [82] viridisLite_0.3.0        vipor_0.4.5